InĀ [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="darkgrid")
InĀ [2]:
# Read train_values.csv (resolved relative to the working directory) into df.
df = pd.read_csv('train_values.csv')
InĀ [3]:
# Read train_labels.csv (resolved relative to the working directory) into df_labels.
df_labels = pd.read_csv('train_labels.csv')
InĀ [4]:
# Join the label columns onto the feature rows by building_id.
# validate='one_to_one' makes pandas raise MergeError if building_id repeats in
# either frame, instead of silently multiplying rows in the merged result.
df = pd.merge(df, df_labels, on='building_id', validate='one_to_one')
InĀ [5]:
df.head()
Out[5]:
| building_id | geo_level_1_id | geo_level_2_id | geo_level_3_id | count_floors_pre_eq | age | area_percentage | height_percentage | land_surface_condition | foundation_type | ... | has_secondary_use_hotel | has_secondary_use_rental | has_secondary_use_institution | has_secondary_use_school | has_secondary_use_industry | has_secondary_use_health_post | has_secondary_use_gov_office | has_secondary_use_use_police | has_secondary_use_other | damage_grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 802906 | 6 | 487 | 12198 | 2 | 30 | 6 | 5 | t | r | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 1 | 28830 | 8 | 900 | 2812 | 2 | 10 | 8 | 7 | o | r | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 2 | 94947 | 21 | 363 | 8973 | 2 | 10 | 5 | 5 | t | r | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 3 | 590882 | 22 | 418 | 10694 | 2 | 10 | 6 | 5 | t | r | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 4 | 201944 | 11 | 131 | 1488 | 3 | 30 | 8 | 9 | t | r | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
5 rows Ć 40 columns
InĀ [6]:
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 260601 entries, 0 to 260600 Data columns (total 40 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 building_id 260601 non-null int64 1 geo_level_1_id 260601 non-null int64 2 geo_level_2_id 260601 non-null int64 3 geo_level_3_id 260601 non-null int64 4 count_floors_pre_eq 260601 non-null int64 5 age 260601 non-null int64 6 area_percentage 260601 non-null int64 7 height_percentage 260601 non-null int64 8 land_surface_condition 260601 non-null object 9 foundation_type 260601 non-null object 10 roof_type 260601 non-null object 11 ground_floor_type 260601 non-null object 12 other_floor_type 260601 non-null object 13 position 260601 non-null object 14 plan_configuration 260601 non-null object 15 has_superstructure_adobe_mud 260601 non-null int64 16 has_superstructure_mud_mortar_stone 260601 non-null int64 17 has_superstructure_stone_flag 260601 non-null int64 18 has_superstructure_cement_mortar_stone 260601 non-null int64 19 has_superstructure_mud_mortar_brick 260601 non-null int64 20 has_superstructure_cement_mortar_brick 260601 non-null int64 21 has_superstructure_timber 260601 non-null int64 22 has_superstructure_bamboo 260601 non-null int64 23 has_superstructure_rc_non_engineered 260601 non-null int64 24 has_superstructure_rc_engineered 260601 non-null int64 25 has_superstructure_other 260601 non-null int64 26 legal_ownership_status 260601 non-null object 27 count_families 260601 non-null int64 28 has_secondary_use 260601 non-null int64 29 has_secondary_use_agriculture 260601 non-null int64 30 has_secondary_use_hotel 260601 non-null int64 31 has_secondary_use_rental 260601 non-null int64 32 has_secondary_use_institution 260601 non-null int64 33 has_secondary_use_school 260601 non-null int64 34 has_secondary_use_industry 260601 non-null int64 35 has_secondary_use_health_post 260601 non-null int64 36 has_secondary_use_gov_office 260601 non-null int64 37 has_secondary_use_use_police 260601 
non-null int64 38 has_secondary_use_other 260601 non-null int64 39 damage_grade 260601 non-null int64 dtypes: int64(32), object(8) memory usage: 79.5+ MB
InĀ [7]:
df.describe()
Out[7]:
| building_id | geo_level_1_id | geo_level_2_id | geo_level_3_id | count_floors_pre_eq | age | area_percentage | height_percentage | has_superstructure_adobe_mud | has_superstructure_mud_mortar_stone | ... | has_secondary_use_hotel | has_secondary_use_rental | has_secondary_use_institution | has_secondary_use_school | has_secondary_use_industry | has_secondary_use_health_post | has_secondary_use_gov_office | has_secondary_use_use_police | has_secondary_use_other | damage_grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 2.606010e+05 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | ... | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 | 260601.000000 |
| mean | 5.256755e+05 | 13.900353 | 701.074685 | 6257.876148 | 2.129723 | 26.535029 | 8.018051 | 5.434365 | 0.088645 | 0.761935 | ... | 0.033626 | 0.008101 | 0.000940 | 0.000361 | 0.001071 | 0.000188 | 0.000146 | 0.000088 | 0.005119 | 2.238272 |
| std | 3.045450e+05 | 8.033617 | 412.710734 | 3646.369645 | 0.727665 | 73.565937 | 4.392231 | 1.918418 | 0.284231 | 0.425900 | ... | 0.180265 | 0.089638 | 0.030647 | 0.018989 | 0.032703 | 0.013711 | 0.012075 | 0.009394 | 0.071364 | 0.611814 |
| min | 4.000000e+00 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 1.000000 | 2.000000 | 0.000000 | 0.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 2.611900e+05 | 7.000000 | 350.000000 | 3073.000000 | 2.000000 | 10.000000 | 5.000000 | 4.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
| 50% | 5.257570e+05 | 12.000000 | 702.000000 | 6270.000000 | 2.000000 | 15.000000 | 7.000000 | 5.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 |
| 75% | 7.897620e+05 | 21.000000 | 1050.000000 | 9412.000000 | 2.000000 | 30.000000 | 9.000000 | 6.000000 | 0.000000 | 1.000000 | ... | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 3.000000 |
| max | 1.052934e+06 | 30.000000 | 1427.000000 | 12567.000000 | 9.000000 | 995.000000 | 100.000000 | 32.000000 | 1.000000 | 1.000000 | ... | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 3.000000 |
8 rows Ć 32 columns
InĀ [8]:
# building_id was only needed as the merge key; remove it before analysis.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second execution is a no-op, not a KeyError.
df = df.drop(columns='building_id', errors='ignore')
InĀ [9]:
df.head()
Out[9]:
| geo_level_1_id | geo_level_2_id | geo_level_3_id | count_floors_pre_eq | age | area_percentage | height_percentage | land_surface_condition | foundation_type | roof_type | ... | has_secondary_use_hotel | has_secondary_use_rental | has_secondary_use_institution | has_secondary_use_school | has_secondary_use_industry | has_secondary_use_health_post | has_secondary_use_gov_office | has_secondary_use_use_police | has_secondary_use_other | damage_grade | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 6 | 487 | 12198 | 2 | 30 | 6 | 5 | t | r | n | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 1 | 8 | 900 | 2812 | 2 | 10 | 8 | 7 | o | r | n | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 2 | 21 | 363 | 8973 | 2 | 10 | 5 | 5 | t | r | n | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
| 3 | 22 | 418 | 10694 | 2 | 10 | 6 | 5 | t | r | n | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 2 |
| 4 | 11 | 131 | 1488 | 3 | 30 | 8 | 9 | t | r | n | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 3 |
5 rows Ć 39 columns
InĀ [10]:
df['age'].unique()
Out[10]:
array([ 30, 10, 25, 0, 15, 20, 45, 55, 5, 40, 80, 60, 35,
70, 50, 65, 100, 75, 85, 190, 995, 105, 90, 120, 95, 110,
115, 150, 200, 130, 125, 140, 155, 160, 175, 135, 145, 195, 180,
165, 170, 185], dtype=int64)
InĀ [11]:
df['age'].value_counts()
Out[11]:
age 10 38896 15 36010 5 33697 20 32182 0 26041 25 24366 30 18028 35 10710 40 10559 50 7257 45 4711 60 3612 80 3055 55 2033 70 1975 995 1390 100 1364 65 1123 90 1085 85 847 75 512 95 414 120 180 150 142 200 106 110 100 105 89 125 37 115 21 130 9 140 9 180 7 160 6 170 6 175 5 135 5 190 3 145 3 195 2 165 2 155 1 185 1 Name: count, dtype: int64
InĀ [12]:
# Keep only the rows whose age is not 995.
# NOTE(review): 995 looks like a placeholder/outlier code (the next largest age
# seen above is 200) - confirm against the dataset documentation.
df = df.loc[df['age'].ne(995)]
InĀ [13]:
df['age'].unique()
Out[13]:
array([ 30, 10, 25, 0, 15, 20, 45, 55, 5, 40, 80, 60, 35,
70, 50, 65, 100, 75, 85, 190, 105, 90, 120, 95, 110, 115,
150, 200, 130, 125, 140, 155, 160, 175, 135, 145, 195, 180, 165,
170, 185], dtype=int64)
InĀ [14]:
# Box plot of count_floors_pre_eq (y) against age (x), using column names so
# seaborn pulls both vectors from df.
sns.boxplot(data=df, x='age', y='count_floors_pre_eq')
Out[14]:
<Axes: xlabel='age', ylabel='count_floors_pre_eq'>
InĀ [15]:
# Raw scatter of count_floors_pre_eq against age.
# Axis labels and a title are set explicitly so the figure can be read on its
# own; plt.show() ends the cell so no PathCollection repr is echoed.
plt.scatter(df['age'], df['count_floors_pre_eq'])
plt.xlabel('age')
plt.ylabel('count_floors_pre_eq')
plt.title('count_floors_pre_eq vs. age')
plt.show()
Out[15]:
<matplotlib.collections.PathCollection at 0x209c1bbc4d0>
InĀ [16]:
# Relational plot of count_floors_pre_eq vs. age; marker size encodes
# damage_grade, scaled into the (15, 200) size range.
sns.relplot(
    data=df,
    x="age",
    y="count_floors_pre_eq",
    size="damage_grade",
    sizes=(15, 200),
)
Out[16]:
<seaborn.axisgrid.FacetGrid at 0x209c04a6690>
InĀ [17]:
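# TODO: disabled line plot - choose a column for `hue` (currently an empty string) before enabling.
#sns.relplot(data = df, kind = "line", x = "age", y = "count_floors_pre_eq", hue = "")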
InĀ [18]:
df['has_superstructure_bamboo'].value_counts()
Out[18]:
has_superstructure_bamboo 0 237201 1 22010 Name: count, dtype: int64
InĀ [19]:
# For every has_superstructure_* column, count the rows whose flag equals 1.
# One vectorized pass replaces eleven copy-pasted per-column expressions.
superstructure_flag_totals = df.filter(like="has_superstructure_").eq(1).sum()

df_mud_mortar_stone = superstructure_flag_totals["has_superstructure_mud_mortar_stone"]
# Backward-compatible alias: the original (misleading) name for the
# mud_mortar_stone count is still referenced by later cells.
df_mud_adobe_mud = df_mud_mortar_stone
df_adobe_mud = superstructure_flag_totals["has_superstructure_adobe_mud"]
df_stone_flag = superstructure_flag_totals["has_superstructure_stone_flag"]
df_cement_mortar_stone = superstructure_flag_totals["has_superstructure_cement_mortar_stone"]
df_mud_mortar_brick = superstructure_flag_totals["has_superstructure_mud_mortar_brick"]
df_cement_mortar_brick = superstructure_flag_totals["has_superstructure_cement_mortar_brick"]
df_timber = superstructure_flag_totals["has_superstructure_timber"]
df_bamboo = superstructure_flag_totals["has_superstructure_bamboo"]
df_rc_non_engineered = superstructure_flag_totals["has_superstructure_rc_non_engineered"]
df_rc_engineered = superstructure_flag_totals["has_superstructure_rc_engineered"]
df_other = superstructure_flag_totals["has_superstructure_other"]
InĀ [20]:
#df_adobe_mud, df_bamboo, df_mud_mortar_stone, df_stone_flag, df_cement_mortar_stone, df_cement_mortar_brick, df_timber, df_rc_non_engineered, df_rc_engineered, df_other
InĀ [21]:
df_mud_adobe_mud
Out[21]:
197524
InĀ [22]:
df_bamboo
Out[22]:
22010
InĀ [23]:
# Display label -> number of buildings flagged with that superstructure type.
# Note: df_mud_adobe_mud holds the has_superstructure_mud_mortar_stone count,
# which is why it is paired with the 'Mud Mortar Stone' label.
superstructure_counts = dict([
    ('Mud Mortar Stone', df_mud_adobe_mud),
    ('Adobe Mud', df_adobe_mud),
    ('Stone Flag', df_stone_flag),
    ('Cement Mortar Stone', df_cement_mortar_stone),
    ('Mud Mortar Brick', df_mud_mortar_brick),
    ('Cement Mortar Brick', df_cement_mortar_brick),
    ('Timber', df_timber),
    ('Bamboo', df_bamboo),
    ('RC Non-Engineered', df_rc_non_engineered),
    ('RC Engineered', df_rc_engineered),
    ('Other', df_other),
])

# One bar per superstructure type, drawn through the explicit figure/axes API.
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(superstructure_counts.keys(), superstructure_counts.values(), color='skyblue')
ax.set(
    title='Counts of Buildings by Superstructure Type',
    xlabel='Superstructure Type',
    ylabel='Number of Buildings',
)
# Tilt the category labels so the longer names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
fig.tight_layout()
# Persist the chart next to the notebook before displaying it.
fig.savefig("building.png")
plt.show()
InĀ [24]:
# Mean damage_grade for every (age, count_floors_pre_eq) combination.
data = df[['age', 'count_floors_pre_eq', 'damage_grade']]
grouped_data = data.groupby(['age', 'count_floors_pre_eq'], as_index=False).mean()
ages, floors, damage_grade = (
    grouped_data[column] for column in ('age', 'count_floors_pre_eq', 'damage_grade')
)

# Bubble chart: both marker area (mean grade * 50) and colour encode the mean
# damage grade of each group.
fig, ax = plt.subplots(figsize=(10, 6))
bubbles = ax.scatter(
    ages,
    floors,
    s=damage_grade * 50,
    c=damage_grade,
    cmap='coolwarm',
    alpha=0.8,
)
fig.colorbar(bubbles, ax=ax, label='Damage Grade')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Floors')
ax.set_title('Damage Grade by Age and Number of Floors')
ax.grid(True)
plt.show()
InĀ [25]:
from ydata_profiling import ProfileReport
InĀ [26]:
# Build a ydata-profiling report object over the current df (building_id
# dropped, age == 995 rows removed) with the explorative option switched on.
# NOTE(review): profiling a frame of this size is presumably slow - confirm
# whether a sampled or minimal report would be sufficient.
profile = ProfileReport(df, title='Pandas Profiling Report', explorative=True)
InĀ [27]:
profile
Summarize dataset: 0%| | 0/5 [00:00<?, ?it/s]
Generate report structure: 0%| | 0/1 [00:00<?, ?it/s]
Render HTML: 0%| | 0/1 [00:00<?, ?it/s]
Out[27]:
InĀ [28]:
# Export the profiling report to output.html (path relative to the working
# directory) so it can be viewed outside the notebook.
profile.to_file("output.html")
Export report to file: 0%| | 0/1 [00:00<?, ?it/s]